In [22]:
    
import logging
import datetime
import numpy as np
import pandas as pd

# Log to stderr so progress messages from long-running cells show up.
root = logging.getLogger()
root.addHandler(logging.StreamHandler())

%matplotlib inline
from shapely.prepared import prep
from shapely import speedups
speedups.enable()  # speed up repeated shapely geometry operations
    
In [ ]:
    
import pandas as pd
important_columns1 = ['species', 'dateidentified', 'eventdate', 'basisofrecord',
                      'decimallatitude', 'decimallongitude', 'day', 'month', 'year']
result_with_lat_long = pd.DataFrame(columns=important_columns1)
counter = 0
# Read the merged GBIF records chunk by chunk; keep only chunks that carry
# coordinates, and only the columns of interest from each chunk.
for df in pd.read_msgpack("../data/fish/selection/merged.msg", iterator=True):
    counter += 1
    if counter % 100 == 0:
        print("%s Processing.. %s " % (datetime.datetime.now().time().isoformat(), counter))
    if "decimallatitude" in df.columns and "decimallongitude" in df.columns:
        common_columns = list(set(important_columns1).intersection(df.columns))
        result_with_lat_long = result_with_lat_long.append(df[common_columns], ignore_index=True)
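    
Aside: DataFrame.append copies the accumulated frame on every iteration, which gets slow as the result grows. A usually faster pattern (a sketch under the same input assumptions) collects the chunks in a list and concatenates once:
In [ ]:
    
# Equivalent accumulation with a single concat at the end (sketch):
chunks = []
for df in pd.read_msgpack("../data/fish/selection/merged.msg", iterator=True):
    if "decimallatitude" in df.columns and "decimallongitude" in df.columns:
        chunks.append(df[list(set(important_columns1).intersection(df.columns))])
result_with_lat_long = pd.concat(chunks, ignore_index=True)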
    
    
In [ ]:
    
result_with_lat_long = result_with_lat_long[result_with_lat_long.decimallatitude.notnull() & result_with_lat_long.decimallongitude.notnull()]
    
In [ ]:
    
result_with_lat_long['species'].unique().size # number of distinct species with coordinates
    
It is best to take into account all observations that have either "year" or "eventdate" present (or both). Let's group them by species name and count the number of observation records.
In [ ]:
    
grouped_lat_long_year_or_eventdate = pd.DataFrame()
grouped_lat_long_year_or_eventdate['count'] = result_with_lat_long[result_with_lat_long.eventdate.notnull() | result_with_lat_long.year.notnull()].groupby('species').size()
grouped_lat_long_year_or_eventdate.head(10) # peek at the top 10 only
    
In [ ]:
    
# Keep records after 1990; eventdate is an ISO-formatted string, so the
# lexicographic '>' works as a date comparison here.
year_or_eventdate_1990 = result_with_lat_long[['species', 'year', 'eventdate', 'basisofrecord', 'decimallatitude', 'decimallongitude']][(result_with_lat_long.year > 1990) | (result_with_lat_long.eventdate > "1990")]
grouped_year_or_eventdate_1990 = pd.DataFrame()
grouped_year_or_eventdate_1990['numobservations'] = year_or_eventdate_1990.groupby('species').size()
grouped_year_or_eventdate_1990.shape[0]  # number of species with records after 1990
    
In [ ]:
    
year_or_eventdate_1990.basisofrecord.unique()
    
It seems sensible to keep only observations of type 'OBSERVATION', 'HUMAN_OBSERVATION', and 'MACHINE_OBSERVATION'.
In [ ]:
    
final_selection = year_or_eventdate_1990[year_or_eventdate_1990.basisofrecord.isin(['OBSERVATION', 'HUMAN_OBSERVATION', 'MACHINE_OBSERVATION'])]
    
In [ ]:
    
final_selection.species.unique().shape
    
In [ ]:
    
final_selection
    
In [ ]:
    
from iSDM.species import GBIFSpecies
    
In [ ]:
    
all_species = GBIFSpecies(name_species='All')
    
In [ ]:
    
all_species.set_data(final_selection)
    
In [ ]:
    
all_species.get_data().species.unique().shape # number of distinct species
    
In [ ]:
    
all_species.get_data().shape[0] # 1939675: the number of observations satisfying our criteria (after 1990, with an accepted observation type)
    
In [ ]:
    
year_or_eventdate_1990.shape[0] # count before filtering on basisofrecord
    
In [ ]:
    
all_species.geometrize()
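    
geometrize() comes from the iSDM package; judging from the geometry column used below, it converts the decimallatitude/decimallongitude pairs into shapely Point geometries. A rough plain-geopandas equivalent (a sketch of the presumed behaviour, not iSDM's actual implementation; geometrize_sketch is a hypothetical helper):
In [ ]:
    
from geopandas import GeoDataFrame
from shapely.geometry import Point

def geometrize_sketch(df):
    # Build a Point per record from (lon, lat) and drop the raw coordinate
    # columns, mirroring what geometrize() appears to do.
    points = [Point(xy) for xy in zip(df['decimallongitude'], df['decimallatitude'])]
    return GeoDataFrame(df.drop(['decimallatitude', 'decimallongitude'], axis=1),
                        geometry=points, crs={'init': 'EPSG:4326'})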
    
In [ ]:
    
all_species.get_data().species.unique().shape # confirm no species were lost by geometrize()
    
In [ ]:
    
final_observations = all_species.get_data()[['species', 'year','eventdate', 'basisofrecord','geometry']]
    
In [ ]:
    
# Persist the geometrized observations so later runs can start from here:
# final_observations.to_file("../data/bias_grid/final_observations", driver="ESRI Shapefile")
    
In [17]:
    
from geopandas import GeoDataFrame
final_observations = GeoDataFrame.from_file("../data/bias_grid/final_observations/")
    
In [18]:
    
final_observations.head()
    
    Out[18]:
At 30 arcsec resolution one pixel is 0.5/60 = 0.0083333333 degrees, which gives 'height': 21600, 'width': 43200 for a global map. Generate a 2D array of that shape and use it as the basis for the bias grid.
In [19]:
    
x_min, y_min, x_max, y_max = -180, -90, 180, 90
pixel_size = 0.083333333  # 5 arcmin; use 0.0083333333 for 30 arcsec
x_res = int((x_max - x_min) / pixel_size)  # 4320 columns at 5 arcmin
y_res = int((y_max - y_min) / pixel_size)  # 2160 rows at 5 arcmin
    
In [20]:
    
x_res
    
    Out[20]:
In [21]:
    
y_res
    
    Out[21]:
We will use a memory-mapped bias-grid file so that RAM usage stays bounded while the counts are accumulated.
In [23]:
    
# One int32 counter per grid cell, backed by a file on disk.
bias_grid_mm = np.memmap("../data/bias_grid/bias_grid_mm_5arcmin.dat", dtype='int32', mode='w+', shape=(y_res, x_res))
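    
As a quick size check: the file holds one int32 per grid cell, so at 5 arcmin it stays comfortably small.
In [ ]:
    
y_res * x_res * 4 / 1024.0**2  # 2160 x 4320 cells x 4 bytes ~= 35.6 MiB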
    
In [24]:
    
def increase_pixel(row):
    # Increment the grid cell containing this observation point: latitude
    # maps to a row (top edge is +90), longitude to a column (left edge
    # is -180).
    bias_grid_mm[np.abs(int((row.y - 90) / pixel_size)),
                 np.abs(int((row.x + 180) / pixel_size))] += 1
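    
A sanity check of the index arithmetic on a made-up point (lon=5.5, lat=52.1 is hypothetical, not taken from the data); the top-left cell (0, 0) corresponds to lat=90, lon=-180:
In [ ]:
    
from shapely.geometry import Point
p = Point(5.5, 52.1)  # hypothetical observation (x=lon, y=lat)
row = np.abs(int((p.y - 90) / pixel_size))   # -> 454
col = np.abs(int((p.x + 180) / pixel_size))  # -> 2226
row, col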
    
In [25]:
    
_ = final_observations.geometry.apply(increase_pixel) # row-by-row update; keeps the memory usage stable
    
In [26]:
    
bias_grid_mm.max()
    
    Out[26]:
In [13]:
    
bias_grid_mm.std() # this still eats memory (it builds intermediate data structures, of course)
    
    Out[13]:
In [27]:
    
bias_grid_mm.sum()
    
    Out[27]:
In [28]:
    
# Check that the total count in the bias grid equals the number of observations.
bias_grid_mm.sum() == final_observations.shape[0]
    
    Out[28]:
In [29]:
    
bias_grid_mm.flush() # write the accumulated counts to disk
    
In [19]:
    
del bias_grid_mm
    
In [ ]:
    
# To read the grid back later, open the same file in read-only mode:
    
In [16]:
    
bias_grid_mm = np.memmap("../data/bias_grid/bias_grid_mm_5arcmin.dat", dtype='int32', mode='r', shape=(y_res, x_res))
    
In [5]:
    
bias_grid_mm.sum()
    
    Out[5]:
In [6]:
    
import gc
gc.collect()
    
    Out[6]:
In [30]:
    
import rasterio
from rasterio.transform import Affine
# Pixel (0, 0) maps to the top-left corner (-180, 90); the negative y scale
# makes the raster north-up.
transform = Affine.translation(x_min, y_max) * Affine.scale(pixel_size, -pixel_size)
crs = {'init': "EPSG:4326"}
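    
The transform maps (column, row) pixel indices to map coordinates, so pixel (0, 0) should land on the top-left corner of the globe:
In [ ]:
    
transform * (0, 0)  # -> (-180.0, 90.0), the top-left corner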
    
In [31]:
    
transform
    
    Out[31]:
In [32]:
    
with rasterio.open("../data/bias_grid/bias_grid_5arcmin.tif", 'w', driver='GTiff', width=x_res, height=y_res,
                   count=1,
                   dtype=np.uint32,
                   nodata=0,
                   transform=transform,
                   crs=crs) as out:
    # Counts are non-negative, so store them as uint32; the context manager
    # closes the file, so no explicit close() is needed.
    out.write(bias_grid_mm.astype(np.uint32), indexes=1)
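    
To verify the GeoTIFF round-trips correctly, read it back and compare its total count against the memmap:
In [ ]:
    
# Read the written raster back and confirm no counts were lost.
with rasterio.open("../data/bias_grid/bias_grid_5arcmin.tif") as src:
    band = src.read(1)
band.sum() == bias_grid_mm.sum()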
    
In [33]:
    
bias_grid_mm.shape
    
    Out[33]:
In [ ]: